home *** CD-ROM | disk | FTP | other *** search
- Subject: v08i077: Soundex spelling checker, Part02/02
- Newsgroups: mod.sources
- Approved: mirror!rs
-
- Submitted by: Barry Brachman <brachman@cs.ubc.cdn>
- Mod.sources: Volume 8, Issue 77
- Archive-name: sp/Part02
-
- #! /bin/sh
- # This is a shell archive. Remove anything before this line,
- # then unpack it by saving it in a file and typing "sh file".
- # If all goes well, you will see the message "End of archive 2 (of 2)."
- # Contents: sp.9 sp.c sp.h sp.ml
- PATH=/bin:/usr/bin:/usr/ucb; export PATH
- echo shar: extracting "'sp.9'" '(1112 characters)'
- if test -f 'sp.9' ; then
- echo shar: will not over-write existing file "'sp.9'"
- else
- sed 's/^X//' >sp.9 <<'@//E*O*F sp.9//'
- X.TH SP 9-EMACS "8-Aug-1985"
- X.UC
- X.SH NAME
- Xsp, sp-from-buffer \- give possible spellings
- X.br
- Xget-next-word \- return word at cursor
- X.SH SYNOPSIS
- X.B (sp)
- X.br
- X.B (sp-from-buffer)
- X.br
- X.B (get-next-word)
- X.SH DESCRIPTION
- X.I Sp
- Xprompts for a word and then invokes the
- X.B sp
- Xprogram with the word as an argument.
- XThe output of the
- X.B sp
- Xprogram, a list of words, is placed in a buffer called "sp".
- XIf the word is found, then the cursor is positioned at the word.
- XIf the word isn't found, the cursor is left at the start of the buffer.
- XIn either case the mode line of the buffer indicates whether the word
- Xwas found.
- X.sp 2
- X.B Sp-from-buffer
- Xis the same as
- X.B sp
- Xexcept that the user is not prompted for the word; the word under or
- Ximmediately to the left of the cursor is used.
- X.sp 2
- X.B Get-next-word
- Xreturns the word the cursor is pointing at or the word immediately
- Xto the left of the cursor if it is between words.
- X.SH BUFFERS USED
- Xsp \- result
- X.SH FILES
- X/usr/local/lib/emacs/sp.ml
- X.br
- X/usr/local/sp
- X.SH SEE ALSO
- X.B sp(1-LOCAL)
- X.SH AUTHOR
- XBarry Brachman
- X.br
- XDept. of Computer Science
- X.br
- XUniversity of British Columbia
- @//E*O*F sp.9//
- if test 1112 -ne "`wc -c <'sp.9'`"; then
- echo shar: error transmitting "'sp.9'" '(should have been 1112 characters)'
- fi
- fi # end of overwriting check
- echo shar: extracting "'sp.c'" '(9128 characters)'
- if test -f 'sp.c' ; then
- echo shar: will not over-write existing file "'sp.c'"
- else
- sed 's/^X//' >sp.c <<'@//E*O*F sp.c//'
- X/* vi: set tabstop=4 : */
- X
- X/*
- X * Version 1.3 December 1986
- X *
- X * sp - spell word
- X *
- X * Usage: sp [-f dictionary-list] [-eavc] [word ...]
- X *
- X * Compute the Soundex code for each word on the command line
- X * (or each word on the standard input) and compare against a
- X * dictionary
- X *
- X * The soundex dictionary list may be specified on the command line
- X * The environment variable SPPATH may be set to a list of colon
- X * separated pathnames of soundex dictionaries.
- X * If a command line dictionary-list (a colon separated list of pathnames) is
- X * given in addition to the SPPATH variable, all dictionaries are used.
- X *
- X * To reduce the size of the word list, certain heuristics are used:
- X * the -a option causes all words matched to be printed
- X * The output is alphabetically sorted and indicators are printed
- X * beside each word:
- X * X == exact match
- X * ! == close match
- X * * == near match
- X * ' ' == matched
- X *
- X * Note that the maximum number of colliding words is MAXCOUNT due to the
- X * data structure used.
- X *
- X * Permission is given to copy or distribute this program provided you
- X * do not remove this header or make money off of the program.
- X *
- X * Please send comments and suggestions to:
- X *
- X * Barry Brachman
- X * Dept. of Computer Science
- X * Univ. of British Columbia
- X * Vancouver, B.C. V6T 1W5
- X *
- X * .. {ihnp4!alberta, uw-beaver}!ubc-vision!ubc-cs!brachman
- X * brachman@cs.ubc.cdn
- X * brachman%ubc.csnet@csnet-relay.arpa
- X * brachman@ubc.csnet
- X */
- X
- X#include <sys/types.h>
- X#include <sys/file.h>
- X#include <ctype.h>
- X#include <stdio.h>
- X
- X#ifdef NEWDBM
- X#include <ndbm.h>
- X#else !NEWDBM
- X#include <dbm.h>
- X#endif NEWDBM
- X
- X#include "sp.h"
- X
- X#define streq(X, Y) (!strcmp(X, Y))
- X#define range(S) ((strlen(S) + 4) / 5)
- X
- X#define USAGE "Usage: sp [-f dictionary-list] [-eavc] [word ...]"
- X
- Xchar word[MAXWORDLEN + 2];
- X
- Xdatum FETCH();
- X
- Xchar *fileptr[MAXDICT + 1]; /* Up to MAXDICT dictionaries + sentinel */
- Xint dict_ptr = 0;
- X
- Xchar *wordptr[MAXWORDS], *wordlistptr;
- Xchar wordlist[WORDSPACE];
- Xint nmatched;
- X
- X/*
- X * Soundex codes
- X * The program depends upon the numbers zero through six being used
- X * but this can easily be changed
- X */
- Xchar soundex_code_map[26] = {
- X/*** A B C D E F G H I J K L M N O P ***/
- X 0, 1, 2, 3, 0, 1, 2, 0, 0, 2, 2, 4, 5, 5, 0, 1,
- X
- X/*** Q R S T U V W X Y Z ***/
- X 2, 6, 2, 3, 0, 1, 0, 2, 0, 2
- X};
- X
- Xint aflag, cflag, eflag, vflag;
- X
- Xmain(argc, argv)
- Xint argc;
- Xchar **argv;
- X{
- X register int fflag, i;
- X register char *p;
- X char *getenv();
- X
- X argc--; argv++;
- X fileptr[0] = (char *) NULL;
- X while (argc > 0 && argv[0][0] == '-') {
- X fflag = 0; /* to break out of following loop... */
- X for (i = 1; argv[0][i] != '\0' && fflag == 0; i++) {
- X switch (argv[0][i]) {
- X case 'a':
- X aflag = 1;
- X break;
- X case 'c':
- X cflag = 1;
- X break;
- X case 'e':
- X eflag = 1;
- X break;
- X case 'f':
- X if (argc == 1) {
- X fprintf(stderr, "%s\n", USAGE);
- X exit(1);
- X }
- X mkfilelist(argv[1]);
- X argc--;
- X argv++;
- X fflag = 1; /* break out of loop */
- X break;
- X case 'v':
- X vflag = 1;
- X break;
- X default:
- X fprintf(stderr, "%s\n", USAGE);
- X exit(1);
- X }
- X }
- X argc--, argv++;
- X }
- X
- X if ((p = getenv("SPPATH")) != (char *) NULL)
- X mkfilelist(p);
- X if (fileptr[0] == (char *) NULL)
- X mkfilelist(DEFAULT_SPPATH);
- X if (vflag) {
- X printf("Using dictionaries:\n");
- X for (i = 0; fileptr[i] != (char *) NULL; i++)
- X if (strlen(fileptr[i]) > 0)
- X printf("\t%s\n", fileptr[i]);
- X }
- X if (argc) {
- X for (i = 0; i < argc; i++) {
- X if (!eflag)
- X printf("%s:\n", argv[i]);
- X apply(argv[i]);
- X if (!eflag)
- X printf("\n");
- X }
- X }
- X else {
- X int ch, len;
- X
- X while (1) {
- X printf("Word? ");
- X if (fgets(word, sizeof(word), stdin) == (char *) NULL) {
- X printf("\n");
- X break;
- X }
- X len = strlen(word);
- X if (word[len - 1] != '\n') {
- X fprintf(stderr, "sp: Word too long: %s", word);
- X while ((ch = getchar()) != '\n') /* flush rest of line */
- X putc(ch, stderr);
- X putc('\n', stderr);
- X continue;
- X }
- X word[--len] = '\0';
- X if (len > MAXWORDLEN) {
- X fprintf(stderr, "sp: Word too long: %s\n", word);
- X continue;
- X }
- X
- X apply(word);
- X if (!eflag)
- X printf("\n");
- X }
- X }
- X}
- X
- X/*
- X * Apply the Soundex search for a word to each dictionary in turn
- X * Note that 'DBMINIT' opens both the '.dir' and the '.pag' files
- X * and we must close them to avoid running out of file descriptors
- X *
- X * This routine gets called each time a word is looked up and therefore
- X * the dbm files may be repeatedly opened and closed. Since the vast majority
- X * of the time this program is invoked for just a single word it doesn't seem
- X * worthwhile to do the right thing by saving file descriptors/DBM pointers.
- X * There probably won't be more than two dictionaries in use anyway.
- X */
- Xapply(word)
- Xchar *word;
- X{
- X register int code, i, nodicts;
- X
- X nmatched = 0;
- X wordlistptr = wordlist;
- X if ((code = soundex(word, 3)) == BAD_WORD)
- X return;
- X nodicts = 1;
- X for (i = 0; fileptr[i] != (char *) NULL; i++) {
- X if (strlen(fileptr[i]) == 0)
- X continue;
- X if (DBMINIT(fileptr[i], O_RDONLY) != -1) {
- X proc(code);
- X nodicts = 0;
- X }
- X DBMCLOSE();
- X }
- X if (nodicts) {
- X fprintf(stderr, "sp: Can't open any dictionaries\n");
- X exit(1);
- X }
- X if (vflag && !eflag && nmatched == 0)
- X printf("%s: no match\n", word);
- X else
- X choose(word);
- X}
- X
- X/*
- X * Look the word up in the current dictionary
- X * and save all the matches
- X * Note that only three digits are of the Soundex code are stored
- X * in a dictionary
- X */
- Xproc(soundex)
- Xint soundex;
- X{
- X register int c, len;
- X datum dbm_key, dbm_content;
- X key_t *key, keyvec[KEYSIZE];
- X char *mk_word(), *p;
- X
- X key = keyvec;
- X dbm_key.dptr = (char *) key;
- X dbm_key.dsize = KEYSIZE;
- X c = 0;
- X while (1) {
- X mk_key(key, soundex, c);
- X dbm_content = FETCH(dbm_key);
- X
- X if (dbm_content.dptr == 0)
- X break;
- X
- X if (IS_DELETED(dbm_content)) {
- X if (++c > MAXCOUNT) {
- X fprintf(stderr, "sp: entry count overflow\n");
- X exit(1);
- X }
- X continue;
- X }
- X
- X if (nmatched == MAXWORDS) {
- X fprintf(stderr, "sp: Too many matches\n");
- X exit(1);
- X }
- X
- X p = mk_word(dbm_content.dptr, dbm_content.dsize, soundex);
- X len = strlen(p);
- X if (wordlistptr + len >= &wordlist[WORDSPACE]) {
- X fprintf(stderr, "sp: Out of space for words\n");
- X exit(1);
- X }
- X strncpy(wordlistptr, p, len);
- X wordlistptr[len] = '\0';
- X wordptr[nmatched++] = wordlistptr;
- X wordlistptr += len + 1;
- X if (++c > MAXCOUNT) {
- X fprintf(stderr, "sp: entry count overflow\n");
- X exit(1);
- X }
- X }
- X}
- X
- X/*
- X * Select and print those words which we consider
- X * to have matched 'word'
- X */
- Xchoose(word)
- Xregister char *word;
- X{
- X register int c, code, i, len, mcount, wordlen;
- X register char *p;
- X int compar();
- X
- X code = soundex(word, 4);
- X qsort(wordptr, nmatched, sizeof(char *), compar);
- X c = range(word);
- X wordlen = strlen(word);
- X mcount = 0;
- X for (i = 0; i < nmatched; i++) {
- X p = wordptr[i];
- X if (strmatch(word, p) == 0) {
- X printf("X");
- X if (eflag) {
- X printf(" %s\n", word);
- X return;
- X }
- X }
- X else if (eflag)
- X continue;
- X else if (soundex(p, 4) == code)
- X printf("!");
- X else if (aflag &&
- X (wordlen < (len = strlen(p)) - c || len > wordlen + c))
- X printf(" ");
- X else if (!cflag)
- X printf("*");
- X else
- X continue;
- X printf("%3d. %s\n", mcount + 1, p);
- X mcount++;
- X }
- X if (vflag)
- X printf("(%d total matches)\n", nmatched);
- X}
- X
- X/*
- X * Compute an 'n' digit Soundex code for 'word'
- X * See mksp.c
- X */
- Xsoundex(word, n)
- Xregister char *word;
- Xint n;
- X{
- X register int c, digit_part, previous_code, soundex_length;
- X register char *p, *w;
- X char wcopy[MAXWORDLEN + 2];
- X
- X strcpy(wcopy, word);
- X p = w = wcopy;
- X while (*p != '\0') {
- X if (isupper(*p))
- X *p = tolower(*p);
- X p++;
- X }
- X if (!isalpha(*w)) {
- X fprintf(stderr, "sp: Improper word: %s\n", word);
- X return(BAD_WORD);
- X }
- X digit_part = 0;
- X soundex_length = 0;
- X previous_code = soundex_code_map[*w - 'a'];
- X for (p = w + 1; *p != '\0' && soundex_length < n; p++) {
- X if (!isalpha(*p))
- X continue;
- X c = soundex_code_map[*p - 'a'];
- X if (c == 0 || previous_code == c) {
- X previous_code = c;
- X continue;
- X }
- X digit_part = digit_part * 7 + c;
- X previous_code = c;
- X soundex_length++;
- X }
- X while (soundex_length++ < n)
- X digit_part *= 7;
- X return((digit_part << 5) + *w - 'a');
- X}
- X
- X/*
- X * Process a path string (environment variable SPPATH, DEFAULT_SPPATH, or an
- X * arg) by separating the pathnames into strings pointed to by elements
- X * of 'fileptr'
- X * End of list indicated by fileptr entry of NULL
- X *
- X * No attempt made to ignore duplicate pathnames
- X */
- Xmkfilelist(p)
- Xregister char *p;
- X{
- X register int len;
- X register char *path, *start;
- X char *malloc();
- X
- X while (*p != '\0' && dict_ptr < MAXDICT) {
- X start = p;
- X while (*p != ':' && *p != '\0')
- X p++;
- X if (start == p && *p == ':') { /* colon with nothing else */
- X p++;
- X continue;
- X }
- X len = p - start;
- X path = (char *) malloc((unsigned) (len + 1));
- X if (path == (char *) NULL) {
- X fprintf(stderr, "sp: Out of dictionary space\n");
- X exit(1);
- X }
- X strncpy(path, start, len);
- X path[len] = '\0';
- X fileptr[dict_ptr++] = path;
- X }
- X fileptr[dict_ptr] = (char *) NULL;
- X}
- X
- Xcompar(p, q)
- Xchar **p, **q;
- X{
- X
- X return(strmatch(*p, *q));
- X/* return(strcmp(*p, *q)); */ /* use if you prefer case sensitive */
- X}
- X
- @//E*O*F sp.c//
- if test 9128 -ne "`wc -c <'sp.c'`"; then
- echo shar: error transmitting "'sp.c'" '(should have been 9128 characters)'
- fi
- fi # end of overwriting check
- echo shar: extracting "'sp.h'" '(3553 characters)'
- if test -f 'sp.h' ; then
- echo shar: will not over-write existing file "'sp.h'"
- else
- sed 's/^X//' >sp.h <<'@//E*O*F sp.h//'
- X/* sp.h */
- X/* vi: set tabstop=4 : */
- X
- X/*
- X * A deleted dbm entry is denoted by a dsize of zero
- X */
- X#define IS_DELETED(C) (C.dsize == 0)
- X
- X/*
- X * Because the soundex code (part of the key) includes the first character of
- X * the word, we don't need to store the first character again with the content.
- X * To do this we treat the first byte of the content stored in the dbm
- X * specially: we rip off the two high order bits of the first byte of
- X * the content and therefore have to restrict the value of the second
- X * character of the word. We use 'a' == 0, 'z' == 25, 'A' == 26, 'Z' == 51.
- X * See spchar_map[] (misc.c) for the mapping of codes 52 through 63.
- X * This behaviour is isolated in tospchar() and fromspchar().
- X * If spchar_map is changed you should change the man page too.
- X *
- X * The word can be reconstructed by extracting the first character of the word
- X * from the soundex code and then looking at the first byte of the content.
- X * If the UPPER_CHAR bit is on in the first byte of the content then the first
- X * character of the word should be upper case.
- X * The length of the content reflects the actual number of bytes stored in the
- X * dbm. Words that have been deleted from the dbm are stored with a length of
- X * zero. Because of this, words of length 1 are treated differently: they are
- X * stored with a length of 1 and with the SINGLE_CHAR bit set. Words with
- X * original length > 1 will have (length - 1) bytes stored in the content.
- X * Clear?
- X */
- X#define IS_VALID(w) (isalpha(*w) && (*(w+1) == '\0' || isalpha(*(w+1)) \
- X || tospchar(*(w+1)) != '\0'))
- X#define UPPER_CHAR 0200 /* 1st char of word is upper case */
- X#define SINGLE_CHAR 0100 /* single char word */
- X#define MASK_CHAR 0077 /* mask out the indicator bits */
- X#define QUOTE_CHAR 0064 /* (52) code for single quote */
- X#define AMPER_CHAR 0065 /* (53) for ampersand */
- X#define PERIOD_CHAR 0066 /* (54) for period */
- X#define SPACE_CHAR 0067 /* (55) for blank */
- X
- X/*
- X * Map for first byte of dbm content (special characters)
- X * Terminated by a null entry
- X */
- Xstruct spchar_map {
- X char spchar;
- X char code;
- X};
- X
- X#define MAXDICT 10 /* Max number of dictionaries to use */
- X#define MAXWORDLEN 50 /* Max word length */
- X#define MAXWORDS 400 /* Max number of words in one sp query */
- X#define WORDSPACE 20480 /* Max space used words for one sp query */
- X
- X/*
- X * This is the default path used by sp to find dictionaries
- X * Adjust for local conditions
- X */
- X#define DEFAULT_SPPATH "/usr/local/lib/sp.dict.1:/usr/local/lib/sp.dict.2"
- X
- X/*
- X * The following must be the maximum value containable in the count part of
- X * a key.
- X * It must be always be less than: (the maximum positive value that can be
- X * contained in an int) - 1
- X * This value imposes a limit on the number of words in a dictionary having the
- X * same soundex code. For /usr/dict/words (~25K words), a count of 255 is
- X * sufficient. Larger dictionaries will need more. In any case you can
- X * always just make another dictionary and split up your words.
- X * You might want to adjust MAXWORDS and WORDSPACE (above) to reflect MAXCOUNT
- X * if you've got plenty of memory.
- X */
- X#define MAXCOUNT 1023 /* 2^10 - 1 */
- X
- X/*
- X * The key used by dbm looks like this:
- X *
- X * <10 bits> <5 bits> <9 bits>
- X * counter first char soundex
- X *
- X * A soundex value is treated as a base 7 number (maximum is 666, base 7).
- X */
- X#define KEYSIZE 3 /* in bytes */
- Xtypedef unsigned char key_t;
- X
- X#define BAD_WORD -1 /* This must be an illegal Soundex */
- X#define NO_MATCH 0
- X#define MATCHED 1
- X
- Xextern char soundex_code_map[];
- X
- @//E*O*F sp.h//
- if test 3553 -ne "`wc -c <'sp.h'`"; then
- echo shar: error transmitting "'sp.h'" '(should have been 3553 characters)'
- fi
- fi # end of overwriting check
- echo shar: extracting "'sp.ml'" '(1613 characters)'
- if test -f 'sp.ml' ; then
- echo shar: will not over-write existing file "'sp.ml'"
- else
- sed 's/^X//' >sp.ml <<'@//E*O*F sp.ml//'
- X;
- X; Written by Donald Acton and Barry Brachman with help from Marc Majka
- X; March 11/85
- X;
- X; Dept. of Computer Science
- X; University of British Columbia
- X;
- X
- X(defun
- X (sp spword
- X (setq spword (get-tty-string "Word: " ))
- X (do-sp spword)
- X )
- X)
- X
- X(defun ; bjb
- X (sp-from-buffer spword
- X (setq spword (get-next-word))
- X (message (concat "Word: " spword))
- X (sit-for 0)
- X (do-sp spword)
- X )
- X)
- X
- X(defun
- X (do-sp curr found tmp
- X (setq curr (current-buffer-name))
- X (pop-to-buffer "sp")
- X (setq needs-checkpointing 0)
- X (erase-buffer)
- X (set-mark)
- X (filter-region (concat "sp " spword))
- X (beginning-of-file)
- X (setq found " *not found*")
- X (setq tmp case-fold-search)
- X (setq case-fold-search 1)
- X (error-occurred (re-search-forward (concat "\. " spword "$"))
- X (beginning-of-line)
- X (line-to-top-of-window)
- X (setq found " *found*"))
- X (setq case-fold-search tmp)
- X (setq mode-line-format " %b - %m %p")
- X (setq mode-string (concat spword found))
- X (pop-to-buffer curr)
- X (novalue)))
- X
- X;
- X; Return the word the cursor is pointing at
- X; or the word immediately to the left of the
- X; cursor if it is between words
- X;
- X; April 15/85 - bjb
- X;
- X(defun
- X (get-next-word original-dot rb spword
- X (save-excursion
- X (setq original-dot (dot))
- X (backward-word) (forward-word) (setq rb (dot))
- X (if (> original-dot rb)
- X (progn (forward-word) (backward-word))
- X (backward-word))
- X (set-mark)
- X (forward-word)
- X (setq spword (region-to-string))
- X )
- X spword
- X )
- X)
- X
- @//E*O*F sp.ml//
- if test 1613 -ne "`wc -c <'sp.ml'`"; then
- echo shar: error transmitting "'sp.ml'" '(should have been 1613 characters)'
- fi
- fi # end of overwriting check
- echo shar: "End of archive 2 (of 2)."
- cp /dev/null ark2isdone
- DONE=true
- for I in 1 2; do
- if test ! -f ark${I}isdone; then
- echo "You still need to run archive ${I}."
- DONE=false
- fi
- done
- case $DONE in
- true)
- echo "You have run both archives."
- echo 'See the README'
- ;;
- esac
- ## End of shell archive.
- exit 0
-